{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# colab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 141
    },
    "colab_type": "code",
    "id": "jviywGyWyKsA",
    "outputId": "3b941253-2874-4020-fdbb-bde7b947cea0"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting bert-tensorflow\n",
      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/a6/66/7eb4e8b6ea35b7cc54c322c816f976167a43019750279a8473d355800a93/bert_tensorflow-1.0.1-py2.py3-none-any.whl (67kB)\n",
      "\r",
      "\u001b[K     |████▉                           | 10kB 16.8MB/s eta 0:00:01\r",
      "\u001b[K     |█████████▊                      | 20kB 4.2MB/s eta 0:00:01\r",
      "\u001b[K     |██████████████▋                 | 30kB 5.9MB/s eta 0:00:01\r",
      "\u001b[K     |███████████████████▍            | 40kB 3.9MB/s eta 0:00:01\r",
      "\u001b[K     |████████████████████████▎       | 51kB 4.7MB/s eta 0:00:01\r",
      "\u001b[K     |█████████████████████████████▏  | 61kB 5.6MB/s eta 0:00:01\r",
      "\u001b[K     |████████████████████████████████| 71kB 4.0MB/s \n",
      "\u001b[?25hRequirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from bert-tensorflow) (1.12.0)\n",
      "Installing collected packages: bert-tensorflow\n",
      "Successfully installed bert-tensorflow-1.0.1\n"
     ]
    }
   ],
   "source": [
    "!pip install bert-tensorflow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "hsZvic2YxnTz"
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import tensorflow as tf\n",
    "import tensorflow_hub as hub\n",
    "import pickle\n",
    "import bert\n",
    "from bert import run_classifier\n",
    "from bert import optimization\n",
    "from bert import tokenization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "NZM71PjIOF_I"
   },
   "outputs": [],
   "source": [
    "def pretty_print(result):\n",
    "    df = pd.DataFrame([result]).T\n",
    "    df.columns = [\"values\"]\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "ZCYOL8HbIZu2"
   },
   "outputs": [],
   "source": [
    "def create_tokenizer_from_hub_module(bert_model_hub):\n",
    "  \"\"\"Get the vocab file and casing info from the Hub module.\"\"\"\n",
    "  with tf.Graph().as_default():\n",
    "    bert_module = hub.Module(bert_model_hub)\n",
    "    tokenization_info = bert_module(signature=\"tokenization_info\", as_dict=True)\n",
    "    with tf.Session() as sess:\n",
    "      vocab_file, do_lower_case = sess.run([tokenization_info[\"vocab_file\"],\n",
    "                                            tokenization_info[\"do_lower_case\"]])\n",
    "      \n",
    "  return bert.tokenization.FullTokenizer(\n",
    "      vocab_file=vocab_file, do_lower_case=do_lower_case)\n",
    "\n",
    "def make_features(dataset, label_list, MAX_SEQ_LENGTH, tokenizer, DATA_COLUMN, LABEL_COLUMN):\n",
    "    input_example = dataset.apply(lambda x: bert.run_classifier.InputExample(guid=None, \n",
    "                                          text_a = x[DATA_COLUMN], \n",
    "                                          text_b = None, \n",
    "                                          label = x[LABEL_COLUMN]), axis = 1)\n",
    "    features = bert.run_classifier.convert_examples_to_features(input_example, label_list, MAX_SEQ_LENGTH, tokenizer)\n",
    "    return features\n",
    "\n",
    "def create_model(bert_model_hub, is_predicting, input_ids, input_mask, segment_ids, labels,\n",
    "                 num_labels):\n",
    "  \"\"\"Creates a classification model.\"\"\"\n",
    "\n",
    "  bert_module = hub.Module(\n",
    "      bert_model_hub,\n",
    "      trainable=True)\n",
    "  bert_inputs = dict(\n",
    "      input_ids=input_ids,\n",
    "      input_mask=input_mask,\n",
    "      segment_ids=segment_ids)\n",
    "  bert_outputs = bert_module(\n",
    "      inputs=bert_inputs,\n",
    "      signature=\"tokens\",\n",
    "      as_dict=True)\n",
    "\n",
    "  # Use \"pooled_output\" for classification tasks on an entire sentence.\n",
    "  # Use \"sequence_outputs\" for token-level output.\n",
    "  output_layer = bert_outputs[\"pooled_output\"]\n",
    "\n",
    "  hidden_size = output_layer.shape[-1].value\n",
    "\n",
    "  # Create our own layer to tune for politeness data.\n",
    "  output_weights = tf.get_variable(\n",
    "      \"output_weights\", [num_labels, hidden_size],\n",
    "      initializer=tf.truncated_normal_initializer(stddev=0.02))\n",
    "\n",
    "  output_bias = tf.get_variable(\n",
    "      \"output_bias\", [num_labels], initializer=tf.zeros_initializer())\n",
    "\n",
    "  with tf.variable_scope(\"loss\"):\n",
    "\n",
    "    # Dropout helps prevent overfitting\n",
    "    output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)\n",
    "\n",
    "    logits = tf.matmul(output_layer, output_weights, transpose_b=True)\n",
    "    logits = tf.nn.bias_add(logits, output_bias)\n",
    "    log_probs = tf.nn.log_softmax(logits, axis=-1)\n",
    "\n",
    "    # Convert labels into one-hot encoding\n",
    "    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)\n",
    "\n",
    "    predicted_labels = tf.squeeze(tf.argmax(log_probs, axis=-1, output_type=tf.int32))\n",
    "    # If we're predicting, we want predicted labels and the probabiltiies.\n",
    "    if is_predicting:\n",
    "      return (predicted_labels, log_probs)\n",
    "\n",
    "    # If we're train/eval, compute loss between predicted and actual label\n",
    "    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)\n",
    "    loss = tf.reduce_mean(per_example_loss)\n",
    "    return (loss, predicted_labels, log_probs)\n",
    "\n",
    "# model_fn_builder actually creates our model function\n",
    "# using the passed parameters for num_labels, learning_rate, etc.\n",
    "def model_fn_builder(bert_model_hub, num_labels, learning_rate, num_train_steps,\n",
    "                     num_warmup_steps):\n",
    "  \"\"\"Returns `model_fn` closure for TPUEstimator.\"\"\"\n",
    "  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument\n",
    "    \"\"\"The `model_fn` for TPUEstimator.\"\"\"\n",
    "\n",
    "    input_ids = features[\"input_ids\"]\n",
    "    input_mask = features[\"input_mask\"]\n",
    "    segment_ids = features[\"segment_ids\"]\n",
    "    label_ids = features[\"label_ids\"]\n",
    "\n",
    "    # mode = tf.estimator.ModeKeys.PREDICT\n",
    "\n",
    "    is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)\n",
    "    \n",
    "    # TRAIN and EVAL\n",
    "    if not is_predicting:\n",
    "\n",
    "      (loss, predicted_labels, log_probs) = create_model(\n",
    "        bert_model_hub, is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)\n",
    "\n",
    "      train_op = bert.optimization.create_optimizer(\n",
    "          loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)\n",
    "\n",
    "      # Calculate evaluation metrics. \n",
    "      def metric_fn(label_ids, predicted_labels):\n",
    "        print(\"#########\\npredicted_labels\\n##########\")\n",
    "        print(predicted_labels)\n",
    "        print(\"#########\\npredicted_labels\\n##########\")\n",
    "        accuracy = tf.metrics.accuracy(label_ids, predicted_labels)\n",
    "        f1_score = tf.contrib.metrics.f1_score(\n",
    "            label_ids,\n",
    "            predicted_labels)\n",
    "        auc = tf.metrics.auc(\n",
    "            label_ids,\n",
    "            predicted_labels)\n",
    "        recall = tf.metrics.recall(\n",
    "            label_ids,\n",
    "            predicted_labels)\n",
    "        precision = tf.metrics.precision(\n",
    "            label_ids,\n",
    "            predicted_labels) \n",
    "        true_pos = tf.metrics.true_positives(\n",
    "            label_ids,\n",
    "            predicted_labels)\n",
    "        true_neg = tf.metrics.true_negatives(\n",
    "            label_ids,\n",
    "            predicted_labels)   \n",
    "        false_pos = tf.metrics.false_positives(\n",
    "            label_ids,\n",
    "            predicted_labels)  \n",
    "        false_neg = tf.metrics.false_negatives(\n",
    "            label_ids,\n",
    "            predicted_labels)\n",
    "        return {\n",
    "            \"eval_accuracy\": accuracy,\n",
    "            \"f1_score\": f1_score,\n",
    "            \"auc\": auc,\n",
    "            \"precision\": precision,\n",
    "            \"recall\": recall,\n",
    "            \"true_positives\": true_pos,\n",
    "            \"true_negatives\": true_neg,\n",
    "            \"false_positives\": false_pos,\n",
    "            \"false_negatives\": false_neg\n",
    "        }\n",
    "\n",
    "      eval_metrics = metric_fn(label_ids, predicted_labels)\n",
    "\n",
    "      if mode == tf.estimator.ModeKeys.TRAIN:\n",
    "        return tf.estimator.EstimatorSpec(mode=mode,\n",
    "          loss=loss,\n",
    "          train_op=train_op)\n",
    "      else:\n",
    "          return tf.estimator.EstimatorSpec(mode=mode,\n",
    "            loss=loss,\n",
    "            eval_metric_ops=eval_metrics)\n",
    "    else:\n",
    "      (predicted_labels, log_probs) = create_model(\n",
    "        bert_model_hub, is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)\n",
    "\n",
    "      predictions = {\n",
    "          'probabilities': log_probs,\n",
    "          'labels': predicted_labels\n",
    "      }\n",
    "      return tf.estimator.EstimatorSpec(mode, predictions=predictions)\n",
    "\n",
    "  # Return the actual model function in the closure\n",
    "  return model_fn\n",
    "\n",
    "def estimator_builder(bert_model_hub, OUTPUT_DIR, SAVE_SUMMARY_STEPS, SAVE_CHECKPOINTS_STEPS, label_list, LEARNING_RATE, num_train_steps, num_warmup_steps, BATCH_SIZE):\n",
    "\n",
    "    # Specify outpit directory and number of checkpoint steps to save\n",
    "    run_config = tf.estimator.RunConfig(\n",
    "        model_dir=OUTPUT_DIR,\n",
    "        save_summary_steps=SAVE_SUMMARY_STEPS,\n",
    "        save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)\n",
    "\n",
    "    model_fn = model_fn_builder(\n",
    "      bert_model_hub = bert_model_hub,\n",
    "      num_labels=len(label_list),\n",
    "      learning_rate=LEARNING_RATE,\n",
    "      num_train_steps=num_train_steps,\n",
    "      num_warmup_steps=num_warmup_steps)\n",
    "\n",
    "    estimator = tf.estimator.Estimator(\n",
    "      model_fn=model_fn,\n",
    "      config=run_config,\n",
    "      params={\"batch_size\": BATCH_SIZE})\n",
    "    return estimator, model_fn, run_config\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "IuMOGwFui4it"
   },
   "outputs": [],
   "source": [
    "def run_on_dfs(train, test, predict, DATA_COLUMN, LABEL_COLUMN, \n",
    "               MAX_SEQ_LENGTH = 128,\n",
    "               BATCH_SIZE = 32,\n",
    "               LEARNING_RATE = 2e-5,\n",
    "               NUM_TRAIN_EPOCHS = 3.0,\n",
    "               WARMUP_PROPORTION = 0.1,\n",
    "               SAVE_SUMMARY_STEPS = 100,\n",
    "               SAVE_CHECKPOINTS_STEPS = 500,\n",
    "               bert_model_hub = \"https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1\"):\n",
    "\n",
    "    label_list = train[LABEL_COLUMN].unique().tolist()\n",
    "    \n",
    "    tokenizer = create_tokenizer_from_hub_module(bert_model_hub)\n",
    "\n",
    "    train_features = make_features(train, label_list, MAX_SEQ_LENGTH, tokenizer, DATA_COLUMN, LABEL_COLUMN)\n",
    "    test_features = make_features(test, label_list, MAX_SEQ_LENGTH, tokenizer, DATA_COLUMN, LABEL_COLUMN)\n",
    "\n",
    "    num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)\n",
    "    num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)\n",
    "\n",
    "    estimator, model_fn, run_config = estimator_builder(\n",
    "                                  bert_model_hub, \n",
    "                                  OUTPUT_DIR, \n",
    "                                  SAVE_SUMMARY_STEPS, \n",
    "                                  SAVE_CHECKPOINTS_STEPS, \n",
    "                                  label_list, \n",
    "                                  LEARNING_RATE, \n",
    "                                  num_train_steps, \n",
    "                                  num_warmup_steps, \n",
    "                                  BATCH_SIZE)\n",
    "\n",
    "    train_input_fn = bert.run_classifier.input_fn_builder(\n",
    "        features=train_features,\n",
    "        seq_length=MAX_SEQ_LENGTH,\n",
    "        is_training=True,\n",
    "        drop_remainder=False)\n",
    "\n",
    "    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)\n",
    "\n",
    "    test_input_fn = run_classifier.input_fn_builder(\n",
    "        features=test_features,\n",
    "        seq_length=MAX_SEQ_LENGTH,\n",
    "        is_training=False,\n",
    "        drop_remainder=False)\n",
    "\n",
    "    result_dict = estimator.evaluate(input_fn=test_input_fn, steps=None)\n",
    "\n",
    "# prediction\n",
    "    pred_example = predict.apply(lambda x: bert.run_classifier.InputExample(guid = \"\", \n",
    "                                          text_a = x['comment'], \n",
    "                                          text_b = None, \n",
    "                                          label = 0), axis = 1)\n",
    "    \n",
    "    pred_features = bert.run_classifier.convert_examples_to_features(pred_example, label_list, MAX_SEQ_LENGTH, tokenizer)\n",
    "\n",
    "    pred_input_fn = run_classifier.input_fn_builder(features=pred_features,\n",
    "                             seq_length=MAX_SEQ_LENGTH,\n",
    "                             is_training=False,\n",
    "                             drop_remainder=False)\n",
    "                        \n",
    "    predictions = estimator.predict(input_fn=pred_input_fn)\n",
    "\n",
    "    return result_dict, estimator, predictions\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "wpUHTA8LIZu_"
   },
   "outputs": [],
   "source": [
    "import random\n",
    "random.seed(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "qmP8MBvjIZvB"
   },
   "outputs": [],
   "source": [
    "OUTPUT_DIR = 'output'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "paozkzav26Q8"
   },
   "source": [
    "----- you just need to focus from here ------"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "XSUjVoFtJVEO"
   },
   "source": [
    "## Get your data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "XFBwDlmnIZvD"
   },
   "outputs": [],
   "source": [
    "# with open(\"dianping_train_test.pickle\", 'rb') as f:\n",
    "#     train, test = pickle.load(f)\n",
    "df = pd.read_csv('/content/train.csv', sep='\\t')\n",
    "train = df[:10000]\n",
    "test = df[8000:10000]\n",
    "# test.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "9TsJrBA0KkOL"
   },
   "outputs": [],
   "source": [
    "pred_df = pd.read_csv('/content/test_new.csv', sep=',')\n",
    "# pred_comments = list(pred_df['comment'])\n",
    "# pred_comments"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "Tju0c4dqIZvK"
   },
   "outputs": [],
   "source": [
    "train = train.sample(len(train))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "hM8M7k0gKk5H"
   },
   "outputs": [],
   "source": [
    "myparam = {\n",
    "        \"DATA_COLUMN\": \"comment\",\n",
    "        \"LABEL_COLUMN\": \"sentiment\",\n",
    "        \"LEARNING_RATE\": 2e-5,\n",
    "        \"NUM_TRAIN_EPOCHS\":3,\n",
    "        \"bert_model_hub\":\"https://tfhub.dev/google/bert_chinese_L-12_H-768_A-12/1\"\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "Dg2apeXpMqG-"
   },
   "outputs": [],
   "source": [
    "result, estimator, predictions = run_on_dfs(train, test, pred_df, **myparam)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "YLOvtveTMqwp"
   },
   "outputs": [],
   "source": [
    "pretty_print(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "UMaHp2j24unE"
   },
   "outputs": [],
   "source": [
    "pred_result = []\n",
    "for prediction in predictions:\n",
    "    pred_result.append(prediction)\n",
    "print(pred_result)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "nTU7oRV-DQXw"
   },
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "colab_type": "code",
    "id": "ZQB1QUcbAcoa",
    "outputId": "5ca52b43-cc03-4c79-a1c7-33708f5c1c31"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2000"
      ]
     },
     "execution_count": 30,
     "metadata": {
      "tags": []
     },
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(pred_result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "XYrT27Plenbt"
   },
   "outputs": [],
   "source": [
    "with open('/content/predictions.csv', 'w') as f:\n",
    "#   for prediction in predictions:\n",
    "#       f.writelines(str(prediction) + \"\\n\")\n",
    "  for i in range(0, 2000):\n",
    "    f.write(str(pred_result[i]['labels']) + \"\\n\")"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "NLP-bert-try.ipynb",
   "provenance": [],
   "toc_visible": true
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
